library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(forecast)
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(fpp))
## Warning: package 'fpp' was built under R version 3.6.3
## Warning: package 'fma' was built under R version 3.6.3
## Warning: package 'expsmooth' was built under R version 3.6.3
## Warning: package 'lmtest' was built under R version 3.6.3
## Warning: package 'zoo' was built under R version 3.6.3
## Warning: package 'tseries' was built under R version 3.6.3
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(hrbrthemes))
## Warning: package 'hrbrthemes' was built under R version 3.6.3
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(viridis))
## Warning: package 'viridis' was built under R version 3.6.3
suppressPackageStartupMessages(library(plotly))
## Warning: package 'plotly' was built under R version 3.6.3
suppressPackageStartupMessages(library(gapminder))
## Warning: package 'gapminder' was built under R version 3.6.3
suppressPackageStartupMessages(library(htmlwidgets))
## Warning: package 'htmlwidgets' was built under R version 3.6.3
suppressPackageStartupMessages(library(emojifont))
## Warning: package 'emojifont' was built under R version 3.6.3
suppressPackageStartupMessages(library(wordcloud2))
## Warning: package 'wordcloud2' was built under R version 3.6.3
options(warn = -1)
setwd("C:/Users/skakar/Desktop/PERSONAL/Blog")

#Import Data and Convert to Text  
whatsapp_txt <- readLines("WhatsApp Chat with Mom.txt")
wa_data = as.data.frame(whatsapp_txt)
wa_data = wa_data %>% rename (Texts = whatsapp_txt)

# Add Index to the Document
wa_data <- tibble::rowid_to_column(wa_data, "ID")

# Regex 
dates = ""
times = ""
sender = ""
message = ""
wa_data$Texts = as.character(wa_data$Texts)


# Loop uses Regex to get Data from .TXT file
for ( i in 1:nrow(wa_data) ){
  hasText       = str_extract(wa_data[i,2], "(.*):(.*)")
  hasDate       = str_extract(wa_data[i,2], "\\d+/\\d+/\\d+")
  if(!is_empty(hasText) && !is.na(hasDate))
  {
    dates[i]   = str_extract(wa_data[i,2], "\\d+/\\d+/\\d+")
    times[i]   = str_extract(wa_data[i,2], "\\d+:\\d(.*)[AM]")
    sender[i]  = str_remove(substring(str_extract(wa_data[i,2], "[AM - ](.*)[:]"), 12), ":" )
    message[i] = unlist(strsplit(str_extract(wa_data[i,2], ":(.*)"), ": "))[2]
  }
  else
  {
    wa_data = wa_data[-c(i),]
  }
}

# Append Data to Dataset
wa_data$Date     = dates
wa_data$Time     = times
wa_data$Time     = trimws(str_trunc(wa_data$Time, 8, "right", ellipsis = "")) 
wa_data$Sender   = trimws(sender)
wa_data$Message  = message

# Convert to Date Type
wa_data$Datetime <- paste(wa_data$Date, wa_data$Time)
wa_data$Datetime <- as.POSIXct(wa_data$Datetime, format = "%m/%d/%Y %I:%M %p", tz = "GMT")
year(wa_data$Datetime) <- year(wa_data$Datetime) + 2000
wa_data$Year     = year(wa_data$Datetime)
wa_data$Month    = as.factor(months(wa_data$Datetime))
wa_data$Month = factor(wa_data$Month,levels=c("January","February","March", "April","May","June","July","August","September", "October","November","December"),ordered=TRUE)

# MetaData
str(wa_data)
## 'data.frame':    4323 obs. of  9 variables:
##  $ ID      : int  1 2 3 4 6 8 9 10 11 12 ...
##  $ Texts   : chr  "7/22/17, 4:05 PM - Messages to this chat and calls are now secured with end-to-end encryption. Tap for more info." "7/26/17, 8:45 AM - Mom: Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions," "7/26/17, 11:02 AM - Mom: <Media omitted>" "7/26/17, 11:04 AM - Mom: Rohit Rana" ...
##  $ Date    : chr  "7/22/17" "7/26/17" "7/26/17" "7/26/17" ...
##  $ Time    : chr  "4:05 PM" "8:45 AM" "11:02 AM" "11:04 AM" ...
##  $ Sender  : chr  "" "Mom" "Mom" "Mom" ...
##  $ Message : chr  NA "Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions," "<Media omitted>" "Rohit Rana" ...
##  $ Datetime: POSIXct, format: "2017-07-22 16:05:00" "2017-07-26 08:45:00" ...
##  $ Year    : num  2017 2017 2017 2017 NA ...
##  $ Month   : Ord.factor w/ 12 levels "January"<"February"<..: 7 7 7 7 NA NA 7 7 7 8 ...
# DATA
head(wa_data)
##   ID
## 1  1
## 2  2
## 3  3
## 4  4
## 6  6
## 8  8
##                                                                                                                 Texts
## 1   7/22/17, 4:05 PM - Messages to this chat and calls are now secured with end-to-end encryption. Tap for more info.
## 2 7/26/17, 8:45 AM - Mom: Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions,
## 3                                                                            7/26/17, 11:02 AM - Mom: <Media omitted>
## 4                                                                                 7/26/17, 11:04 AM - Mom: Rohit Rana
## 6                                                                                             Overland Park, KS 66223
## 8                                                                7/26/17, 11:04 AM - Mom: This is rohit fanaa address
##      Date     Time Sender
## 1 7/22/17  4:05 PM       
## 2 7/26/17  8:45 AM    Mom
## 3 7/26/17 11:02 AM    Mom
## 4 7/26/17 11:04 AM    Mom
## 6    <NA>     <NA>   <NA>
## 8    <NA>     <NA>   <NA>
##                                                                                       Message
## 1                                                                                        <NA>
## 2 Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions,
## 3                                                                             <Media omitted>
## 4                                                                                  Rohit Rana
## 6                                                                                        <NA>
## 8                                                                                        <NA>
##              Datetime Year Month
## 1 2017-07-22 16:05:00 2017  July
## 2 2017-07-26 08:45:00 2017  July
## 3 2017-07-26 11:02:00 2017  July
## 4 2017-07-26 11:04:00 2017  July
## 6                <NA>   NA  <NA>
## 8                <NA>   NA  <NA>
# Data Analysis
print(mean(sapply(strsplit(wa_data$Message, " "), length))) # Average Number of Words per message
## [1] 3.960675
# Chat Distribution
barplot(table(subset(wa_data$Sender, wa_data$Sender %in% as.array(c("Mom","Sunny Kakar")))), main="Chat Distribution",
        xlab="Senders", ylab="Message Count", col = c("#56B4E9", "#009E73"))

# Chat Consistency Through Time

PlotData <- function(data, yearid){
  year_data  = data %>% filter( year(Datetime) == yearid )
  
  message =  paste("Message Distribution: ",as.character(yearid))
  
  year_data %>%
    ggplot( aes(x=Datetime)) +
    geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
    ggtitle(message) +
    theme_ipsum()
}

PlotData(wa_data, "2017")

PlotData(wa_data, "2018")

PlotData(wa_data, "2019")

PlotData(wa_data, "2020")

# Conversation
plot2 = ggplot(data=subset(wa_data, !is.na(Month)), aes(x=Month, group=Year, fill=Year)) +
  ggtitle("Message Count Density (Monthly)") +
  geom_histogram(stat="count") +
  theme_ipsum()

ggplotly(plot2)
# Scatter Plot 
texts_me   = table(format(subset(wa_data$Datetime, wa_data$Sender %in% as.array(c("Sunny Kakar"))),"%Y-%m"))
scatter    = as.data.frame(texts_me)
scatter = scatter %>% rename (Text_Frequency_Me = Freq)
scatter = scatter %>% rename (MYTime = Var1)
scatter$Text_Frequency_Them = table(format(subset(wa_data$Datetime, wa_data$Sender %in% as.array(c("Mom"))),"%Y-%m"))

ggplot(scatter, aes(x=scatter$Text_Frequency_Me, y=scatter$Text_Frequency_Them)) +
  geom_point() +
  geom_smooth(method=lm , color="green", se=FALSE) +
  geom_rug(col="steelblue",alpha=0.1, size=1.5) +
  theme_ipsum() +
  labs(x = "My Text Frequency", y = "Their Text Frequency", title = "Scatterplot (Corr. b/w Texting Frequency)")
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'

# Most Common Texts sent
# -- My Top Texts
my_top_texts  =  as.data.frame(sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Sunny Kakar")))), decreasing=T)[1:10])
my_top_texts
##                Var1 Freq
## 1   <Media omitted>  123
## 2             Hanji   65
## 3          ðŸ‘\215ðŸ\217¼   29
## 4                Ok   27
## 5  Good morning Mom   18
## 6               Mom   18
## 7               Yep   18
## 8              Okay   15
## 9           Morning   10
## 10            Okay.   10
# -- Their Top Texts
their_top_texts  =  as.data.frame(sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Mom")))), decreasing=T)[1:10])
their_top_texts
##                 Var1 Freq
## 1  Missed video call  647
## 2  Missed voice call  253
## 3    <Media omitted>  225
## 4                 Ok   93
## 5  Good Morning beta   71
## 6               ðŸ‘\215   55
## 7               ðŸ\230\230   23
## 8           R u busy   19
## 9          Where r u   19
## 10      R u in class   17
wordcloud2(data=sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Sunny Kakar")))), decreasing=T)[1:50], size=1.6)
wordcloud2(data=sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Mom")))), decreasing=T)[1:50], size=10)
print("Thank you!")
## [1] "Thank you!"